package edu.berkeley.nlp.treebank;

import java.io.Serializable;

import edu.berkeley.nlp.tokenizer.TokenizerFactory;
import edu.berkeley.nlp.util.Filter;
import edu.berkeley.nlp.util.Filters;

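/**
 * Language pack for the Chinese Treebank: its character encoding, punctuation
 * tags and words, label annotation characters and start symbols. (TODO: look
 * into using native2ascii to edit this file as a GB-encoded file.)
 * 
 * @author Roger Levy
 */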

public class ChineseTreebankLanguagePack extends AbstractTreebankLanguagePack
		implements Serializable {

	private static final long serialVersionUID = 5757403475523638802L;

	/** Name of the character encoding reported by {@link #getEncoding()}. */
	public static final String ENCODING = "GB18030";

	/**
	 * Tokenizer factory registered through
	 * {@link #setTokenizerFactory(TokenizerFactory)}. It is only stored;
	 * nothing in this class reads it back.
	 */
	private static TokenizerFactory tf;

	/**
	 * Stores the given tokenizer factory in a static (class-wide) field.
	 * 
	 * @param tf
	 *            The tokenizer factory to remember
	 */
	public static void setTokenizerFactory(TokenizerFactory tf) {
		ChineseTreebankLanguagePack.tf = tf;
	}

	// ------------------------------------------------------------------
	// Static tables. The filter constants further down are built from these
	// arrays, so the arrays must stay textually above them (static
	// initializers run in source order).
	// ------------------------------------------------------------------

	/**
	 * The first 3 are used by the Penn Treebank; # is used by the BLLIP corpus,
	 * and ^ and ~ are used by Klein's lexparser. Identical to
	 * PennTreebankLanguagePack.
	 */
	private static final char[] annotationIntroducingChars = { '-', '=', '|',
			'#', '^', '~' };

	/**
	 * This is valid for "BobChrisTreeNormalizer" conventions only. Again,
	 * identical to PennTreebankLanguagePack.
	 */
	private static final String[] startSymbols = { "ROOT" };

	/** The single punctuation tag. */
	private static final String[] tags = { "PU" };

	// U+3000 is the "ideographic space"; it is grouped with the commas here.
	private static final String[] comma = { ",", "\uff0c", "\u3000" };
	private static final String[] endSentence = { "\u3002", "\uff0e", "\uff01",
			"\uff1f", "?", "!", "." };
	private static final String[] douHao = { "\u3001" };
	private static final String[] quoteMark = { "\u201c", "\u201d", "\u2018",
			"\u2019", "\u300a", "\u300b", "\u300e", "\u300f", "\u3008",
			"\u3009", "\u300c", "\u300d", "\uff02", "\uff1c", "\uff1e", "`",
			"\uff07" };
	private static final String[] parenthesis = { "\uff08", "\uff09", "-LRB-",
			"-RRB-", "\u3010", "\u3011" };
	private static final String[] colon = { "\uff1a", "\uff1b", "\u2236", ":" };
	private static final String[] dash = { "\u2026", "\u2014", "\u2014\u2014",
			"\u2014\u2014\u2014", "\uff0d", "\uff0d\uff0d", "\u2500\u2500",
			"\u2501", "\u2501\u2501", "\u2014\uff0d", "-", "----", "~",
			"\u2026\u2026", "\uff5e" };
	// Slashes are used in urls. The fullwidth slash is listed twice; the
	// duplicate is kept so that punctuationWords() returns the same array
	// contents as before.
	private static final String[] other = { "\u00b7", "\uff0f", "\uff0f",
			"\uff0a", "\uff06", "/", "//", "*" };

	private static final String[] leftQuoteMark = { "\u201c", "\u2018",
			"\u300a", "\u300e", "\u3008", "\u300c", "\uff1c", "`" };
	private static final String[] rightQuoteMark = { "\u201d", "\u2019",
			"\u300b", "\u300f", "\u3009", "\u300d", "\uff1e", "\uff07" };
	private static final String[] leftParenthesis = { "\uff08", "-LRB-",
			"\u3010" };
	private static final String[] rightParenthesis = { "\uff09", "-RRB-",
			"\u3011" };

	/**
	 * Everything returned by {@link #punctuationWords()}: the tag list followed
	 * by each word table, in the order given here.
	 */
	private static final String[] punctWords = concatenate(tags, comma,
			endSentence, douHao, quoteMark, parenthesis, colon, dash, other);

	// One filter per table, built once at class-initialization time instead
	// of on every call.
	// NOTE(review): sharing these instances assumes a filter produced by
	// Filters.collectionAcceptFilter is not modified after construction --
	// confirm against the Filters implementation.
	private static final Filter<String> tagFilter = Filters
			.collectionAcceptFilter(tags);
	private static final Filter<String> commaFilter = Filters
			.collectionAcceptFilter(comma);
	private static final Filter<String> endSentenceFilter = Filters
			.collectionAcceptFilter(endSentence);
	private static final Filter<String> douHaoFilter = Filters
			.collectionAcceptFilter(douHao);
	private static final Filter<String> quoteMarkFilter = Filters
			.collectionAcceptFilter(quoteMark);
	private static final Filter<String> parenthesisFilter = Filters
			.collectionAcceptFilter(parenthesis);
	private static final Filter<String> colonFilter = Filters
			.collectionAcceptFilter(colon);
	private static final Filter<String> dashFilter = Filters
			.collectionAcceptFilter(dash);
	private static final Filter<String> otherFilter = Filters
			.collectionAcceptFilter(other);
	private static final Filter<String> leftParenthesisFilter = Filters
			.collectionAcceptFilter(leftParenthesis);
	private static final Filter<String> rightParenthesisFilter = Filters
			.collectionAcceptFilter(rightParenthesis);
	private static final Filter<String> leftQuoteMarkFilter = Filters
			.collectionAcceptFilter(leftQuoteMark);
	private static final Filter<String> rightQuoteMarkFilter = Filters
			.collectionAcceptFilter(rightQuoteMark);

	/**
	 * Joins the given arrays, in argument order, into one newly allocated
	 * array.
	 */
	private static String[] concatenate(String[]... groups) {
		int total = 0;
		for (String[] group : groups) {
			total += group.length;
		}
		String[] joined = new String[total];
		int offset = 0;
		for (String[] group : groups) {
			System.arraycopy(group, 0, joined, offset, group.length);
			offset += group.length;
		}
		return joined;
	}

	/**
	 * Return the input Charset encoding for the Treebank. See documentation for
	 * the <code>Charset</code> class.
	 * 
	 * @return Name of Charset (the value of {@link #ENCODING})
	 */
	@Override
	public String getEncoding() {
		return ENCODING;
	}

	/**
	 * Accepts a String that is a punctuation tag name, and rejects everything
	 * else. The only accepted tag is "PU".
	 * 
	 * @param str
	 *            The tag to test; <code>null</code> is rejected rather than
	 *            causing a NullPointerException
	 * @return Whether this is a punctuation tag
	 */
	@Override
	public boolean isPunctuationTag(String str) {
		// Constant-first comparison so that a null tag yields false.
		return "PU".equals(str);
	}

	/**
	 * Accepts a String that is a punctuation word, and rejects everything else.
	 * If one can't tell for sure (as for ' in the Penn Treebank), it makes the
	 * best guess that it can. A word is accepted when any of the comma,
	 * sentence-end, dou hao, quote mark, parenthesis, colon, dash or "other"
	 * filters accepts it.
	 * 
	 * @param str
	 *            The word to test
	 * @return Whether this is a punctuation word
	 */
	@Override
	public boolean isPunctuationWord(String str) {
		return commaFilter.accept(str) || endSentenceFilter.accept(str)
				|| douHaoFilter.accept(str) || quoteMarkFilter.accept(str)
				|| parenthesisFilter.accept(str) || colonFilter.accept(str)
				|| dashFilter.accept(str) || otherFilter.accept(str);
	}

	/**
	 * Accepts a String that is a sentence end punctuation tag, and rejects
	 * everything else.
	 * <p>
	 * NOTE(review): despite the name, the argument is tested against the
	 * sentence-final punctuation <i>words</i> (the same filter as
	 * {@link #chineseEndSentenceAcceptFilter()}), not against the array
	 * returned by {@link #sentenceFinalPunctuationTags()}. The behavior is left
	 * unchanged here; confirm whether it is intended.
	 * 
	 * @param str
	 *            The string to test
	 * @return Whether the sentence-end word filter accepts the string
	 */
	@Override
	public boolean isSentenceFinalPunctuationTag(String str) {
		return endSentenceFilter.accept(str);
	}

	/**
	 * Returns a String array of punctuation tags for this treebank/language.
	 * The array is the shared internal one and must not be modified by the
	 * caller.
	 * 
	 * @return The punctuation tags
	 */
	@Override
	public String[] punctuationTags() {
		return tags;
	}

	/**
	 * Returns a String array of punctuation words for this treebank/language.
	 * The array starts with the punctuation tag "PU", followed by the comma,
	 * sentence-end, dou hao, quote mark, parenthesis, colon, dash and "other"
	 * words. It is the shared internal array and must not be modified by the
	 * caller.
	 * 
	 * @return The punctuation words
	 */
	@Override
	public String[] punctuationWords() {
		return punctWords;
	}

	/**
	 * Returns a String array of sentence final punctuation tags for this
	 * treebank/language. This is the same array as {@link #punctuationTags()};
	 * it must not be modified by the caller.
	 * 
	 * @return The sentence final punctuation tags
	 */
	@Override
	public String[] sentenceFinalPunctuationTags() {
		return tags;
	}

	/**
	 * Returns a String array of sentence final punctuation words for this
	 * treebank/language. The array is the shared internal one and must not be
	 * modified by the caller.
	 * 
	 * @return The sentence final punctuation words
	 */
	public String[] sentenceFinalPunctuationWords() {
		return endSentence;
	}

	/**
	 * Accepts a String that is a punctuation tag that should be ignored by
	 * EVALB-style evaluation, and rejects everything else. Traditionally, EVALB
	 * has ignored a subset of the total set of punctuation tags in the English
	 * Penn Treebank (quotes and period, comma, colon, etc., but not brackets).
	 * Here the accepted tags are exactly those of {@link #punctuationTags()}.
	 * 
	 * @param str
	 *            The tag to test
	 * @return Whether this is a EVALB-ignored punctuation tag
	 */
	@Override
	public boolean isEvalBIgnoredPunctuationTag(String str) {
		return tagFilter.accept(str);
	}

	/**
	 * Return an array of characters at which a String should be truncated to
	 * give the basic syntactic category of a label. The idea here is that Penn
	 * treebank style labels follow a syntactic category with various functional
	 * and crossreferencing information introduced by special characters (such
	 * as "NP-SBJ=1"). This would be truncated to "NP" by the array containing
	 * '-' and "=". The array is the shared internal one and must not be
	 * modified by the caller.
	 * 
	 * @return An array of characters that set off label name suffixes
	 */
	@Override
	public char[] labelAnnotationIntroducingCharacters() {
		return annotationIntroducingChars;
	}

	/**
	 * Returns a String array of treebank start symbols. The array is the shared
	 * internal one and must not be modified by the caller.
	 * 
	 * @return The start symbols
	 */
	@Override
	public String[] startSymbols() {
		return startSymbols;
	}

	/** @return A filter accepting the comma-like words */
	public static Filter<String> chineseCommaAcceptFilter() {
		return commaFilter;
	}

	/** @return A filter accepting the sentence-final punctuation words */
	public static Filter<String> chineseEndSentenceAcceptFilter() {
		return endSentenceFilter;
	}

	/** @return A filter accepting the dou hao (U+3001) */
	public static Filter<String> chineseDouHaoAcceptFilter() {
		return douHaoFilter;
	}

	/** @return A filter accepting left and right quote marks */
	public static Filter<String> chineseQuoteMarkAcceptFilter() {
		return quoteMarkFilter;
	}

	/** @return A filter accepting left and right parentheses/brackets */
	public static Filter<String> chineseParenthesisAcceptFilter() {
		return parenthesisFilter;
	}

	/** @return A filter accepting the colon and semicolon words */
	public static Filter<String> chineseColonAcceptFilter() {
		return colonFilter;
	}

	/** @return A filter accepting the dash, ellipsis and tilde words */
	public static Filter<String> chineseDashAcceptFilter() {
		return dashFilter;
	}

	/** @return A filter accepting the remaining punctuation words */
	public static Filter<String> chineseOtherAcceptFilter() {
		return otherFilter;
	}

	/** @return A filter accepting only the left parentheses/brackets */
	public static Filter<String> chineseLeftParenthesisAcceptFilter() {
		return leftParenthesisFilter;
	}

	/** @return A filter accepting only the right parentheses/brackets */
	public static Filter<String> chineseRightParenthesisAcceptFilter() {
		return rightParenthesisFilter;
	}

	/** @return A filter accepting only the left quote marks */
	public static Filter<String> chineseLeftQuoteMarkAcceptFilter() {
		return leftQuoteMarkFilter;
	}

	/** @return A filter accepting only the right quote marks */
	public static Filter<String> chineseRightQuoteMarkAcceptFilter() {
		return rightQuoteMarkFilter;
	}

	/**
	 * Returns the extension of treebank files for this treebank. This is "fid".
	 * 
	 * @return The file extension, without a leading dot
	 */
	public String treebankFileExtension() {
		return "fid";
	}

}
